The two data sets used in this analysis are the Madison case data sourced from the Wisconsin DHS and wastewater concentration data produced by the Wisconsin State Laboratory of Hygiene. This wastewater data has entries every couple of days from November 15 2020 to June 24 2021.
| Date | Site | Cases | SevenDayMACases | N1 | BCoV | N2 | PMMoV | GeoMeanN12 | FlowRate | temperature | equiv_sewage_amt |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2020-09-11 | Madison | 153 | 192.2857 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-12 | Madison | 181 | 194.0000 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-13 | Madison | 171 | 158.7143 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-14 | Madison | 96 | 154.7143 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-15 | Madison | 69 | 153.5714 | NA | NA | NA | NA | NA | NA | NA | NA |
| 2020-09-16 | Madison | 231 | 148.1429 | NA | NA | NA | NA | NA | NA | NA | NA |
The case data has a strong weekend effect so for this section we look at a seven day smoothing of cases. The simple display of the data shows the core components of this story. First that wastewater data is very noisy. And that there is a hint of a relationship between the two signals.
FirstImpressionDF <- FullDF%>%
NoNa("N1","Cases")
FirstImpression <- FirstImpressionDF%>%#Removing outliers
ggplot(aes(x=Date))+#Data depends on time
geom_line(aes(y=N1, color="N1",info=N1))+#compares N1 to Cases
geom_line(aes(y=MinMaxFixing(PMMoV,N1), color="PMMoV",info=PMMoV))+
geom_line(aes(y=MinMaxFixing(FlowRate,N1), color="FlowRate",info=FlowRate))+
geom_line(aes(y=MinMaxFixing(BCoV,N1), color="BCoV",info=BCoV))+
#geom_line(aes(y=MinMaxFixing(temperature,N1), color="temperature",info=temperature))+
#geom_line(aes(y=MinMaxFixing(tss,N1), color="tss",info=tss))+
geom_line(aes(y=MinMaxFixing(equiv_sewage_amt,N1), color="equiv_sewage_amt",info=equiv_sewage_amt))+
geom_line(aes(y=MinMaxFixing(SevenDayMACases,N1), color="Seven Day MA Cases",info=Cases))+
geom_line(aes(y=MinMaxFixing(N2,N1), color="N2",info=N2))+
labs(y="N1")
ggplotly(FirstImpression)
#To remove weekend effects we are looking at the 7 day smoothing of cases.
DaySmoothed=21#Very wide smoothing to find where the data strong deviate from trend
ErrorMarkedDF <- FullDF%>%#Remove older Var data that clearly has no relationship to Cases
mutate(EarlyOutlier = ifelse(Date < mdy("11/20/2020"),TRUE,FALSE))#replace old.
#"11/20/2020"
#"1/1/2021"
ErrorMarkedDF$DetectedOutlierN1 <- IdentifyOutliers(ErrorMarkedDF$N1)
ErrorMarkedDF$DetectedOutlierN2 <- IdentifyOutliers(ErrorMarkedDF$N2)
ErrorMarkedDF$DetectedOutlierGeo <- IdentifyOutliers(ErrorMarkedDF$GeoMeanN12)
ErrorMarkedDF$DetectedOutlierPMMOV <- IdentifyOutliers(ErrorMarkedDF$PMMoV)
ErrorMarkedDF$DetectedOutlierBCOv <- IdentifyOutliers(ErrorMarkedDF$BCoV)
ErrorRemovedDF <- ErrorMarkedDF%>%
mutate(PotentialOutlier = Date>mdy("03/31/2021")&mdy("05/12/2021")>Date)%>%
select(-EarlyOutlier)#Removing unneeded calculated columns
ErrorMarkedDF%>%
select(-EarlyOutlier)%>%
#filter(DetectedOutlierN1|DetectedOutlierN2|DetectedOutlierGeo)%>%
mutate(NumberOfOutliers = DetectedOutlierN1+DetectedOutlierN2+DetectedOutlierGeo)%>%
group_by(NumberOfOutliers)%>%
summarize(n())
## # A tibble: 4 x 2
## NumberOfOutliers `n()`
## <int> <int>
## 1 0 469
## 2 1 7
## 3 2 5
## 4 3 7
ErrorMarkedDF%>%
select(-EarlyOutlier)%>%
summarize(N1 = sum(DetectedOutlierN1), N2 = sum(DetectedOutlierN2), Geo = sum(DetectedOutlierGeo), N12 =sum(DetectedOutlierN1*DetectedOutlierN2), N1Geo =sum(DetectedOutlierN1*DetectedOutlierGeo), N2Geo =sum(DetectedOutlierN2*DetectedOutlierGeo), AllVars =sum(DetectedOutlierN1*DetectedOutlierN2*DetectedOutlierGeo))
## N1 N2 Geo N12 N1Geo N2Geo AllVars
## 1 10 16 12 7 9 10 7
OutlierComp <- ErrorMarkedDF%>%
filter(DetectedOutlierN1|DetectedOutlierN2|DetectedOutlierGeo)%>%
mutate(Name = "",Name = paste(Name,ifelse(DetectedOutlierN1,"N1","")),
Name = paste(Name,ifelse(DetectedOutlierN2,"N2","")),
Name = paste(Name,ifelse(DetectedOutlierGeo,"Geo","")))%>%
ggplot(aes(x=Date))+
geom_point(aes(y=N1,shape="N1",color=Name))+
geom_point(aes(y=N2,shape="N2",color=Name))+
geom_point(aes(y=GeoMeanN12,shape="Geo",color=Name))
ggplotly(OutlierComp)
#,info1=N1,info2=GeoMeanN12
OutlierComp <- ErrorMarkedDF%>%
filter(DetectedOutlierN1|DetectedOutlierPMMOV|DetectedOutlierBCOv)%>%
mutate(Name = "",Name = paste(Name,ifelse(DetectedOutlierN1,"N1","")),
Name = paste(Name,ifelse(DetectedOutlierPMMOV,"PMMoV","")),
Name = paste(Name,ifelse(DetectedOutlierBCOv,"BCoV","")))%>%
ggplot(aes(x=Date))+
geom_point(aes(y=N1,color=Name,info1=PMMoV,info2=BCoV))
ggplotly(OutlierComp)
#ErrorMarkedDF%>%
# select(-EarlyOutlier)%>%
# write.csv(file="OutlierFlagDF")
#wwtp_comments
#analytical_comments
#ErrorRemovedDF%>%
# filter(!is.na(analytical_comments))
CoVarGraphic <- ErrorRemovedDF%>%
ggplot()+#LargeError
aes(y = N1,color=DetectedOutlier,info = Date)
#PotentialOutlier#DetectedOutlier
CoVarGraphicPMMoV <- CoVarGraphic+
geom_point(aes(x = PMMoV))
#Outliers
#26203622#29931423
#149752.0#461781.5
#Potintial outlier
#26463265#30048370
#172041.0#98442.5
CoVarGraphicFlowRate <- CoVarGraphic+
geom_point(aes(x = FlowRate))
CoVarGraphicBCoV <- CoVarGraphic+
geom_point(aes(x = BCoV))
CoVarGraphicequiv_sewage_amt <- CoVarGraphic+#temperature,equiv_sewage_amt
geom_point(aes(x = equiv_sewage_amt))
#PMMoV,FlowRate,BCoV,temperature,equiv_sewage_amt
ggplotly(CoVarGraphicPMMoV)
ggplotly(CoVarGraphicBCoV)
ggplotly(CoVarGraphicFlowRate)
ggplotly(CoVarGraphicequiv_sewage_amt)
ErrorRemovedDF%>%
group_by(DetectedOutlier)%>%
summarize(median(PMMoV,na.rm = TRUE),sd(PMMoV,na.rm = TRUE),
median(N1,na.rm = TRUE),sd(N1,na.rm = TRUE))
AA <- ErrorRemovedDF%>%
ggplot(aes(x=Date,y=equiv_sewage_amt))+
geom_point()
ggplotly(AA)
summary(lm(DetectedOutlier ~ PMMoV ,data = ErrorMarkedDF))
library(rpart)
fit <- rpart(DetectedOutlier ~ PMMoV + N1,
method="class", data=ErrorMarkedDF)
printcp(fit)
plotcp(fit)
summary(fit)
plot(fit)
text(fit, use.n=TRUE, all=TRUE, cex=.5)
CoVarGraphicPMMoV <- CoVarGraphic+
geom_point(aes(x = PMMoV))+
geom_hline(yintercept = 5.61e5)+
geom_vline(xintercept = 2.82e7)
ggplotly(CoVarGraphicPMMoV)
ErrorMarkedDF%>%
ggplot(aes(x = DetectedOutlier, y = log(PMMoV))) +
geom_boxplot()
t.test(PMMoV ~ DetectedOutlier, data = ErrorMarkedDF)
#26203622#29931423
#149752.0#461781.5